{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "36f38e2a-9998-4c73-bfc3-21b74d64a5ee",
   "metadata": {},
   "source": [
    "### End 2 End NLP Project\n",
    "+ Emotion Detection In Text \n",
    "+ Text Classifier"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "f0814628-3d83-4fd6-a511-2eccf79f9f1e",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load EDA Pkgs\n",
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "ea0d580d-c31c-44b7-b09b-10225857eebe",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load Data Viz Pkgs\n",
    "import seaborn as sns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "91eccfbf-d4d0-4e16-b0f7-2d7941efddb0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load Text Cleaning Pkgs\n",
    "import neattext.functions as nfx"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "21e7e868-35fb-483f-82b6-842a29ef1342",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load ML Pkgs\n",
    "# Estimators\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "\n",
    "# Transformers\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import accuracy_score,classification_report,confusion_matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "b209e004-ab77-4407-8689-b4318944d47f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load Dataset\n",
    "df = pd.read_csv(\"data/emotion_dataset_raw.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "fea2d4c0-3bdd-405e-ab69-507ceaac36cb",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Emotion</th>\n",
       "      <th>Text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>neutral</td>\n",
       "      <td>Why ?</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>joy</td>\n",
       "      <td>Sage Act upgrade on my to do list for tommorow.</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>sadness</td>\n",
       "      <td>ON THE WAY TO MY HOMEGIRL BABY FUNERAL!!! MAN ...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>joy</td>\n",
       "      <td>Such an eye ! The true hazel eye-and so brill...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>joy</td>\n",
       "      <td>@Iluvmiasantos ugh babe.. hugggzzz for u .!  b...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   Emotion                                               Text\n",
       "0  neutral                                             Why ? \n",
       "1      joy    Sage Act upgrade on my to do list for tommorow.\n",
       "2  sadness  ON THE WAY TO MY HOMEGIRL BABY FUNERAL!!! MAN ...\n",
       "3      joy   Such an eye ! The true hazel eye-and so brill...\n",
       "4      joy  @Iluvmiasantos ugh babe.. hugggzzz for u .!  b..."
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "430565a3-cf3b-4c6f-afa5-bafd084f5676",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "joy         11045\n",
       "sadness      6722\n",
       "fear         5410\n",
       "anger        4297\n",
       "surprise     4062\n",
       "neutral      2254\n",
       "disgust       856\n",
       "shame         146\n",
       "Name: Emotion, dtype: int64"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Value Counts\n",
    "df['Emotion'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "531d3449-a959-4a19-bff0-3ffed551e619",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "<AxesSubplot:xlabel='Emotion', ylabel='count'>"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZIAAAEGCAYAAABPdROvAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8QVMy6AAAACXBIWXMAAAsTAAALEwEAmpwYAAAZ1ElEQVR4nO3debgkdX3v8fdHcAERBBm5OEMconNVIHFhRHBFMco1GohKHCMC6r1zNajBJRGjj6I+KFGjEQ3c4Aa4IeICmiiSQSRRFoclDEsIE0GYOMK4IW7g4Pf+Ub8jzZk+h2bqnNMc5v16nn66+tdVv/pVnTr16Vr616kqJEnaWPcYdwMkSfObQSJJ6sUgkST1YpBIknoxSCRJvWw+7gbMte23374WL1487mZI0rxywQUX/LCqFgx7b5MLksWLF7Ny5cpxN0OS5pUk35vqPU9tSZJ6MUgkSb0YJJKkXgwSSVIvBokkqReDRJLUi0EiSerFIJEk9WKQSJJ62eS+2T5fXfv2Pxh3EwD4vbesGncTJN3FeEQiSerFIJEk9WKQSJJ6MUgkSb0YJJKkXgwSSVIvBokkqReDRJLUi0EiSerFIJEk9WKQSJJ6MUgkSb0YJJKkXgwSSVIvBokkqZdZC5IkH0tyQ5JLB8q2S3JGkqva87YD770xyeokVyZ55kD57klWtfeOTpJWfu8kn23l5yVZPFvLIkma2mwekRwP7Dup7HBgRVUtAVa01yTZBVgG7NqmOSbJZm2aY4HlwJL2mKjzZcBPquqhwPuBv521JZEkTWnWgqSqzgZ+PKl4P+CENnwCsP9A+UlVdXNVXQ2sBvZIsiOwdVWdU1UFnDhpmom6TgH2mThakSTNnbm+RrJDVa0FaM8PbOULgesGxlvTyha24cnlt5umqtYDNwIPGDbTJMuTrEyyct26dTO0KJIkuOtcbB92JFHTlE83zYaFVcdV1dKqWrpgwYKNbKIkaZi5DpLr2+kq2vMNrXwNsNPAeIuA77fyRUPKbzdNks2BbdjwVJokaZbNdZCcBhzchg8GTh0oX9buxNqZ7qL6+e30101J9mzXPw6aNM1EXc8HzmzXUSRJc2jz2ao4yWeAvYHtk6wB3gocBZyc5GXAtcABAFV1WZKTgcuB9cChVXVrq+oVdHeAbQF8tT0APgp8IslquiORZbO1LJKkqc1akFTVC6d4a58pxj8SOHJI+UpgtyHlv6YFkSRpfO4qF9slSfOUQSJJ6sUgkST1YpBIknoxSCRJvRgkkqReDBJJUi8GiSSpF4NEktSLQSJJ6sUgkST1YpBIknoxSCRJvRgkkqReDBJJUi8GiSSpF4NEktSLQSJJ6sUgkST1YpBIknoxSCRJvRgkkqReDBJJUi8GiSSpF4NEktSLQSJJ6sUgkST1YpBIknoxSCRJvYwlSJK8JsllSS5N8pkk90myXZIzklzVnrcdGP+NSVYnuTLJMwfKd0+yqr13dJKMY3kkaVM250GSZCHwamBpVe0GbAYsAw4HVlTVEmBFe02SXdr7uwL7Asck2axVdyywHFjSHvvO4aJIkhjfqa3NgS2SbA5sCXwf2A84ob1/ArB/G94POKmqbq6qq4HVwB5JdgS2rqpzqqqAEwemkSTNkTkPkqr6b+C9wLXAWuDGqvo6sENVrW3jrAUe2CZZCFw3UMWaVrawDU8u30CS5UlWJlm5bt26mVwcSdrkjePU1rZ0Rxk7Aw8C7pvkwOkmGVJW05RvWFh1XFUtraqlCxYsuLNNliRNYxyntp4OXF1V66rqN8AXgMcD17fTVbTnG9r4a4CdBqZfRHcqbE0bnlwuSZpD4wiSa4E9k2zZ7rLaB7gCOA04uI1zMHBqGz4NWJbk3kl2pruofn47/XVTkj1bPQcNTCNJmiObz/UMq+q8JKcAFwLrgYuA44CtgJOTvIwubA5o41+W5GTg8jb+oVV1a6vuFcDxwBbAV9tDkjSH5jxIAKrqrcBbJxXfTHd0Mmz8I4Ejh5SvBHab8QZKkkbmN9slSb0YJJKkXgwSSVIvBokkqReDRJLUi0EiSerFIJEk9WKQSJJ6MUgkSb0YJJKkXgwSSVIvBokkqReDRJLUi0EiSerFIJEk9WKQSJJ6MUgkSb0YJJKkXsbyU7u6+3rCB58w7ibwrVd9a9xNkDYpHpFIknoxSCRJvRgkkqReDBJJUi8GiSSpl5GCJMmKUcokSZueaW//TXIfYEtg+yTbAmlvbQ08aJbbJkmaB+7oeyT/FziMLjQu4LYg+RnwD7PXLEnSfDFtkFTVB4APJHlVVX1wjtokSZpHRrpGUlUfTPL4JH+e5KCJx8bONMn9k5yS5D+SXJFkryTbJTkjyVXteduB8d+YZHWSK5M8c6B89ySr2ntHJ8nwOUqSZsuoF9s/AbwXeCLw2PZY2mO+HwC+VlUPBx4JXAEcDqyoqiXAivaaJLsAy4BdgX2BY5Js1uo5FlgOLGmPfXu0SZK0EUbta2spsEtVVd8ZJtkaeDJwCEBV3QLckmQ/YO822gnAWcAbgP2Ak6rqZuDqJKuBPZJcA2xdVee0ek8E9ge+2reNkqTRjfo9kkuB/zFD8/x9YB3w8SQXJflIkvsCO1TVWoD2/MA2/kLguoHp17SyhW14cvkGkixPsjLJynXr1s3QYkiSYPQg2R64PMnpSU6beGzkPDcHHgMcW1WPBn5BO401hWHXPWqa8g0Lq46rqqVVtXTBggV3tr2SpGmMemrriBmc5xpgTVWd116fQhck1yfZsarWJtkRuGFg/J0Gpl8EfL+VLxpSLkmaQyMFSVV9c6ZmWFU/SHJdkodV1ZXAPsDl7XEwcFR7PrVNchrw6STvo/s+yxLg/Kq6NclNSfYEzgMOArxFWZLm2EhBkuQmbjttdC/gnsAvqmrrjZzvq4BPJbkX8F3gJXSn2U5O8jLgWuAAgKq6LMnJdEGzHji0qm5t9bwCOB7Ygu4iuxfaJWmOjXpEcr/B10n2B/bY2JlW1cUMv314nynGPxI4ckj5SmC3jW2HJKm/jer9t6q+BDxtZpsiSZqPRj219dyBl/egO5ro/Z0SSdL8N+pdW88ZGF4PXEP3RUFJ0iZu1GskL5nthkiS5qdR+9palOSLSW5Icn2SzydZdMdTSpLu7kY9tfVx4NO0W3KBA1vZH81Go6TZ9s0nP2XcTeApZ8/Y17OksRr1rq0FVfXxqlrfHscD9jUiSRo5SH6Y5MAkm7XHgcCPZrNhkqT5YdQgeSnwZ8APgLXA8+m+jS5J2sSNeo3kHcDBVfUTgCTb0f3Q1Utnq2GSpPlh1COSP5wIEYCq+jHw6NlpkiRpPhk1SO4x6TfUt2P0oxlJ0t3YqGHwd8C3k5xC1zXKnzGkE0VJ0qZn1G+2n5hkJV1HjQGeW1WXz2rLJEnzwsinp1pwGB6SpNvZqG7kJUmaYJBIknoxSCRJvRgkkqReDBJJUi8GiSSpF4NEktSLQSJJ6sUgkST1YpBIknoxSCRJvdgVvKTejjzw+eNuAm/65CnjbsImyyMSSVIvBokkqZexBUmSzZJclOQr7fV2Sc5IclV7HvxFxjcmWZ3kyiTPHCjfPcmq9t7RSTKOZZGkTdk4j0j+Erhi4PXhwIqqWgKsaK9JsguwDNgV2Bc4JslmbZpjgeXAkvbYd26aLkmaMJaL7UkWAX9M93O9r23F+wF7t+ETgLOAN7Tyk6rqZuDqJKuBPZJcA2xdVee0Ok8E9ge+OicLIc2BD73uy+NuAq/8u+eMuwm6ixvXEcnfA38N/HagbIeqWgvQnh/YyhcC1w2Mt6aVLWzDk8s3kGR5kpVJVq5bt25GFkCS1JnzIEnybOCGqrpg1EmGlNU05RsWVh1XVUuraumCBQtGnK0kaRTjOLX1BOBPkjwLuA+wdZJPAtcn2bGq1ibZEbihjb8G2Glg+kXA91v5oiHlkqQ5NOdHJFX1xqpaVFWL6S6in1lVBwKnAQe30Q4GTm3DpwHLktw7yc50F9XPb6e/bkqyZ7tb66CBaSRJc+Su9M32o4CTk7wMuBY4AKCqLktyMnA5sB44tKpubdO8Ajge2ILuIrsX2iVpjo01SKrqLLq7s6iqHwH7TDHekXR3eE0uXwnsNnstlCTdEb/ZLknqxSCRJPVikEiSejFIJEm9GCSSpF4MEklSLwaJJKkXg0SS1ItBIknqxSCRJPVikEiSejFIJEm9GCSSpF4MEklSLwaJJKkXg0SS1ItBIknqxSCRJPVikEiSejFIJEm9GCSSpF4MEklSLwaJJKkXg0SS1ItBIknqxSCRJPVikEiSetl83A0Yt93/6sRxNwGAC95z0LibIEkbZc6PSJLslOQbSa5IclmSv2zl2yU5I8lV7XnbgWnemGR1kiuTPHOgfPckq9p7RyfJXC+PJG3qxnFqaz3wuqp6BLAncGiSXYDDgRVVtQRY0V7T3lsG7ArsCxyTZLNW17HAcmBJe+w7lwsiSRpDkFTV2qq6sA3fBFwBLAT2A05oo50A7N+G9wNOqqqbq+pqYDWwR5Idga2r6pyqKuDEgWkkSXNkrBfbkywGHg2cB+xQVWuhCxvggW20hcB1A5OtaWUL2/Dk8mHzWZ5kZZKV69atm9FlkKRN3diCJMlWwOeBw6rqZ9ONOqSspinfsLDquKpaWlVLFyxYcOcbK0ma0liCJMk96ULkU1X1hVZ8fTtdRXu+oZWvAXYamHwR8P1WvmhIuSRpDo3jrq0AHwWuqKr3Dbx1GnBwGz4YOHWgfFmSeyfZme6i+vnt9NdNSfZsdR40MI0kaY6M43skTwBeDKxKcnEr+xvgKODkJC8DrgUOAKiqy5KcDFxOd8fXoVV1a5vuFcDxwBbAV9tDkjSH5jxIqurfGH59A2CfKaY5EjhySPlKYLeZa50k6c6yixRJUi8GiSSpF4NEktSLQSJJ6sUgkST1YpBIknoxSCRJvRgkkqReDBJJUi8GiSSpF4NEktSLQSJJ6sUgkST1YpBIknoxSCRJvRgkkqRexvELiZKkKRxxxBHjbgJw59rhEYkkqReDRJLUi0EiSerFIJEk9WKQSJJ6MUgkSb0YJJKkXgwSSVIvBokkqReDRJLUi0EiSerFIJEk9TLvO21Msi/wAWAz4CNVddSYmyTpLuiKI88cdxMAeMSbnjbuJsy4eX1EkmQz4B+A/wXsArwwyS7jbZUkbVrmdZAAewCrq+q7VXULcBKw35jbJEmblFTVuNuw0ZI8H9i3qv53e/1i4HFV9cpJ4y0HlreXDwOunOGmbA/8cIbrnA22c2bNh3bOhzaC7Zxps9HOB1fVgmFvzPdrJBlStkEyVtVxwHGz1ohkZVUtna36Z4rtnFnzoZ3zoY1gO2faXLdzvp/aWgPsNPB6EfD9MbVFkjZJ8z1IvgMsSbJzknsBy4DTxtwmSdqkzOtTW1W1PskrgdPpbv/9WFVdNoamzNppsxlmO2fWfGjnfGgj2M6ZNqftnNcX2yVJ4zffT21JksbMIJEk9WKQzJAki5P8+UZO+/MZbsu3Z7K+2dLW2aXjbscwSV6d5Ioknxp3W2ZLkn9Ocv9xt2PcklyTZPtxt2OYJEckeX2Styd5+hzMb/+N6R3EIJk5i4GhQZJkTm9qqKrHz+X87qb+AnhWVb1oYytoXfjMmVG3s3TuUVXPqqqfznKzZs3Ecoy7HXOhqt5SVf8yB7Pan667qTunqjbpB10AXAF8GLgM+DqwBfAQ4GvABcC/Ag9v4x8PPH9g+p+353OBG4GLgdcAhwCfA74MnAlsBawALgRWAftNrmMGl+nndF/WfA9waZvfC9p7n5g0708Bf9JzfvcF/gn49za/FwBvobs9+1K6O0gmbuzYvY13zkT7WvkhwBfaOr8KePdA/c9o41/Y1ulWrfwo4HLgEuC9reyANs9/B87eyOX5f8Atbb29CfhYW5aLJtZd227+tbXpQuDxrXxv4BvAp4HLZ3B9XgNs395fCpzVho9o6/frbZ6HAKe29Xgl8NZJ2/kxbTkePFHnsPkN/K2+Sfc/cDqw44jt/1Kb5jJg+cA2eWSbx7nADq38Ie31d4C3M/C/APxVK78EeNtUyzFD6/Zt3Pa/OfG/vgfw7TafbwMPG9hWv0T3v3018ErgtW28c4HtBpZtg33ICG18U/vb/QvwGeD1DOx3GL7dD12PdNvjVwbq/hBwyLB6gMcDP27LdDHwkJHXa58dyN3h0TbM9cCj2uuTgQPpdvpLWtnjgDPb8O/+oBP/IFP8wQ6h+8LkxEa1ObB1G94eWM1tO9fZCJLnAWfQ3Ra9A3AtsCPwFOBLbbxt2kazec/5PQ/48MDrbSaWu73+BPCcNnwJ8JQ2PDlIvtumvQ/wPbovm24PnA3ct433BrqQ2q79s02sw/u351XAwsGyjVyma9q83wkcOFEf8J90O6Mtgfu08iXAyoHt4BfAzjO8Pq9h6iC5ANhiYD2uBR5A94Ho0jb+YuC3wJ5DlnHY/O5Jt/Nc0MpeQHd7/Sjtn9jmJ+b/ALoeJya2gXcDb27DXwFe2IZfzm3/T8+gfQChO3PyFeDJw5Zjhtbtq9rrv6DrRRxga9r/BvB04PMD63g1cD9gAd0HyJe3994PHNaGh+5D7qB9u9Ntw1u2+a9mIEiYerufaj3uzZAgmaae4xnYv4362CQOC0dwdVVd3IYvoNtYHw98LsnFwD/S7YTvrDOq6sdtOMA7k1xC90ljId0OfrY8EfhMVd1aVdfTfbJ8bFV9E3hokgcCL6T751jfc16rgKcn+dskT6qqG4GnJjkvySrgacCuSbah22C/2ab7xKR6VlTVjVX1a7pPSg8G9qQ71P5W+1sc3Mp/Bvwa+EiS5wK/bHV8Czg+yf+hC9G+ngEc3uZ9Fl3I/R7djvbDbfk+x+1PB5xfVVf3mOew9Tmd06rqVwOvz6iqH7WyL9BtCwDfq6pzR5zfw4DdgDPasr+ZrueIUbw6ycSRx050QXsL3c4ObvsfA9iLbv1Bd0Q14RntcRHdkcLDWz3TLccoplq3XxjStm3o9gGX0gXErgP1fKOqbqqqdXRB8uWB+hcn2YqN24c8CfhiVf2yqn7Ghl+wnmq7n2o9TmWqejbKvP5C4gy6eWD4Vrod/E+r6lFDxl1Pu7aUJMC9pqn3FwPDL6L79LJ7Vf0myTV0O6XZMqwfsgmfaO1ZBry074yq6j+T7A48C3hXkq8DhwJLq+q6JEfQLWsY0hfagMl/h83bNGdU1Qsnj5xkD2CfthyvBJ5WVS9P8jjgj4GLkzyqqn7UY/ECPK+qbtfRZ1um64FH0m0Pvx54e/DvfqdNsT5/t92x4XYzeX6T13FNMd508/sicFlV7XVn2p5kb7pP73tV1S+TnNXa+5tqH3m57W87bVXAu6rqHyfVv3iq5RjFFMsKt217g217B11g/Gmb71kDVQ1uq78deP3bNv09mHofcofNnKb964dt99PUNbjdQNt2NqKeaXlEMtzPgKuTHAC/u6j3yPbeNXSHn9B1WX/PNnwT3aHuVLYBbmgh8lS6T9Wz6WzgBUk2S7KA7rTA+e2944HDAGoGegJI8iDgl1X1SbpzrY9pb/2wfTJ7fpvXT4Ebk0x8Qh7lQva5wBOSPLTNa8sk/7PVu01V/XNblke19x9SVedV1Vvoej/daXi1IzsdeFX70ECSR7fybYC1VfVb4MXMzNEPbR7D1uc13LbdPe8OqvijJNsl2YLu4um3NmJ+VwILkuzVxrlnkl2nqWbCNsBPWog8nO6IcjrnctvyLBsoPx14afs7k2RhO4ruZZptdZhtgP9uw4fcmfm0o4mp9iHTORv40yRbJLkf8JxJ7R+63TP1evwesEuSe7czAvvcQT13tB8byiOSqb0IODbJm+nC4iS6C3QfBk5Ncj7dOdCJT0eXAOvbIf3xwE8m1fcp4MtJVtJdyPqPWWx70X2i3Ku1uYC/rqofAFTV9UmuoLtgOBP+AHhPkt8CvwFeQbcDW0W3A/zOwLgvAT6W5Jd0O4vpF6RqXZJDgM8kuXcrfjPdBn9qkokjnde0996TZEkrW0G3/H28A/h74JIWJtcAz6a72Pv5tqP4Bj2PQiYZtj63AD6a5G+A8+5g+n+jO+p8KPDpqlrZPlGPPL+quiXdzzQc3XZAm9Othzv64PE14OXtFO6VdDu46RwGfDLJ6+gugt8IUFVfT/II4JyW4T+nu3Z56x3Ud0eGrdtTphj33cAJSV5Ld8PMnTXVPmRKVXVhks/S7SO+R3eRftD9GL7dH8bw9XhdkpPp9k9X0Z0qnK6ek+hO2b6a7lrJf42yoHaRcjeT5AHAhVU15RFPki3pdvKPGeH8u+aRFrpLa9Jv8txVtW3xV1VVSZbRXTDeb9ztmm/GvR49IrkbaYftZ9Edsk81ztPpbmd9nyGiu4DdgQ+1o72fMgPX7DZRY12PHpFIknrxYrskqReDRJLUi0EiSerFIJF6SHJrkosHHofPQJ2360k6ydIkR/etV5otXmyXekjy86raaobr3Bt4fVU9eybrlWaLRyTSLEj3GxfvTHJOkpVJHpPk9CT/leTlbZwkeU+SS5OsSvKCNvlRwJPaEc5rkuyd5Cttmu2SfCnJJUnOTfKHrfyIJB9LclaS77YvlElzwu+RSP1ska5TvgnvqqrPtuHrqmqvJO+n6+3gCXR9HV1G11X9c+m6pngkXS+830lyNnA4A0ck7QhlwtuAi6pq/yRPA07ktu4tHg48le5by1cmObaqfjOTCysNY5BI/fxqmo75JnpuXUX3Gyo3ATcl+XW6Xyb8XQ/NwPVJvgk8lq6vt6k8kdanUlWdmeQBrQsTgH+qqpuBm5PcQNf56JoeyyaNxFNb0uwZ7BF2cm+xEz0b31nDppm40Dms92Rp1hkk0vhM1UPzdD2wnk3rNbmd8vph62lWGhs/sUj9TL5G8rWqGvUW4KE9NCf5EbfvSfqigWmOAD7eetf9Jd0PfUlj5e2/kqRePLUlSerFIJEk9WKQSJJ6MUgkSb0YJJKkXgwSSVIvBokkqZf/D8jGbTFp2Y+OAAAAAElFTkSuQmCC",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Plot\n",
    "sns.countplot(x='Emotion',data=df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "40f991d0-952f-40c1-bf00-f3476ce0436d",
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    },
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['BTC_ADDRESS_REGEX',\n",
       " 'CURRENCY_REGEX',\n",
       " 'CURRENCY_SYMB_REGEX',\n",
       " 'Counter',\n",
       " 'DATE_REGEX',\n",
       " 'EMAIL_REGEX',\n",
       " 'EMOJI_REGEX',\n",
       " 'HASTAG_REGEX',\n",
       " 'MASTERCard_REGEX',\n",
       " 'MD5_SHA_REGEX',\n",
       " 'MOST_COMMON_PUNCT_REGEX',\n",
       " 'NUMBERS_REGEX',\n",
       " 'PHONE_REGEX',\n",
       " 'PoBOX_REGEX',\n",
       " 'SPECIAL_CHARACTERS_REGEX',\n",
       " 'STOPWORDS',\n",
       " 'STOPWORDS_de',\n",
       " 'STOPWORDS_en',\n",
       " 'STOPWORDS_es',\n",
       " 'STOPWORDS_fr',\n",
       " 'STOPWORDS_ru',\n",
       " 'STOPWORDS_yo',\n",
       " 'STREET_ADDRESS_REGEX',\n",
       " 'TextFrame',\n",
       " 'URL_PATTERN',\n",
       " 'USER_HANDLES_REGEX',\n",
       " 'VISACard_REGEX',\n",
       " '__builtins__',\n",
       " '__cached__',\n",
       " '__doc__',\n",
       " '__file__',\n",
       " '__generate_text',\n",
       " '__loader__',\n",
       " '__name__',\n",
       " '__numbers_dict',\n",
       " '__package__',\n",
       " '__spec__',\n",
       " '_lex_richness_herdan',\n",
       " '_lex_richness_maas_ttr',\n",
       " 'clean_text',\n",
       " 'defaultdict',\n",
       " 'digit2words',\n",
       " 'extract_btc_address',\n",
       " 'extract_currencies',\n",
       " 'extract_currency_symbols',\n",
       " 'extract_dates',\n",
       " 'extract_emails',\n",
       " 'extract_emojis',\n",
       " 'extract_hashtags',\n",
       " 'extract_html_tags',\n",
       " 'extract_mastercard_addr',\n",
       " 'extract_md5sha',\n",
       " 'extract_numbers',\n",
       " 'extract_pattern',\n",
       " 'extract_phone_numbers',\n",
       " 'extract_postoffice_box',\n",
       " 'extract_shortwords',\n",
       " 'extract_special_characters',\n",
       " 'extract_stopwords',\n",
       " 'extract_street_address',\n",
       " 'extract_urls',\n",
       " 'extract_userhandles',\n",
       " 'extract_visacard_addr',\n",
       " 'fix_contractions',\n",
       " 'generate_sentence',\n",
       " 'hamming_distance',\n",
       " 'inverse_df',\n",
       " 'lexical_richness',\n",
       " 'markov_chain',\n",
       " 'math',\n",
       " 'nlargest',\n",
       " 'normalize',\n",
       " 'num2words',\n",
       " 'random',\n",
       " 're',\n",
       " 'read_txt',\n",
       " 'remove_bad_quotes',\n",
       " 'remove_btc_address',\n",
       " 'remove_currencies',\n",
       " 'remove_currency_symbols',\n",
       " 'remove_custom_pattern',\n",
       " 'remove_custom_words',\n",
       " 'remove_dates',\n",
       " 'remove_emails',\n",
       " 'remove_emojis',\n",
       " 'remove_hashtags',\n",
       " 'remove_html_tags',\n",
       " 'remove_mastercard_addr',\n",
       " 'remove_md5sha',\n",
       " 'remove_multiple_spaces',\n",
       " 'remove_non_ascii',\n",
       " 'remove_numbers',\n",
       " 'remove_phone_numbers',\n",
       " 'remove_postoffice_box',\n",
       " 'remove_puncts',\n",
       " 'remove_punctuations',\n",
       " 'remove_shortwords',\n",
       " 'remove_special_characters',\n",
       " 'remove_stopwords',\n",
       " 'remove_street_address',\n",
       " 'remove_urls',\n",
       " 'remove_userhandles',\n",
       " 'remove_visacard_addr',\n",
       " 'replace_bad_quotes',\n",
       " 'replace_currencies',\n",
       " 'replace_currency_symbols',\n",
       " 'replace_dates',\n",
       " 'replace_emails',\n",
       " 'replace_emojis',\n",
       " 'replace_numbers',\n",
       " 'replace_phone_numbers',\n",
       " 'replace_special_characters',\n",
       " 'replace_term',\n",
       " 'replace_urls',\n",
       " 'string',\n",
       " 'term_freq',\n",
       " 'to_txt',\n",
       " 'word_freq',\n",
       " 'word_length_freq']"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Data Cleaning\n",
    "dir(nfx)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "b1f87847-a91c-4bd6-a307-d746eb5aa9a0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# User handles\n",
    "df['Clean_Text'] = df['Text'].apply(nfx.remove_userhandles)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "id": "03886bc3-1ac4-4f1b-842b-e5d2d770ff81",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Stopwords\n",
    "df['Clean_Text'] = df['Clean_Text'].apply(nfx.remove_stopwords)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "id": "0a0fcc0c-4adf-4f0b-b226-164659ad70ba",
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    },
    "tags": []
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Emotion</th>\n",
       "      <th>Text</th>\n",
       "      <th>Clean_Text</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>neutral</td>\n",
       "      <td>Why ?</td>\n",
       "      <td>?</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>joy</td>\n",
       "      <td>Sage Act upgrade on my to do list for tommorow.</td>\n",
       "      <td>Sage Act upgrade list tommorow.</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>sadness</td>\n",
       "      <td>ON THE WAY TO MY HOMEGIRL BABY FUNERAL!!! MAN ...</td>\n",
       "      <td>WAY HOMEGIRL BABY FUNERAL!!! MAN HATE FUNERALS...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>joy</td>\n",
       "      <td>Such an eye ! The true hazel eye-and so brill...</td>\n",
       "      <td>eye ! true hazel eye-and brilliant ! Regular f...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>joy</td>\n",
       "      <td>@Iluvmiasantos ugh babe.. hugggzzz for u .!  b...</td>\n",
       "      <td>ugh babe.. hugggzzz u .! babe naamazed nga ako...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>34787</th>\n",
       "      <td>surprise</td>\n",
       "      <td>@MichelGW have you gift! Hope you like it! It'...</td>\n",
       "      <td>gift! Hope like it! hand wear ! It'll warm! Lol</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>34788</th>\n",
       "      <td>joy</td>\n",
       "      <td>The world didnt give it to me..so the world MO...</td>\n",
       "      <td>world didnt me..so world DEFINITELY cnt away!!!</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>34789</th>\n",
       "      <td>anger</td>\n",
       "      <td>A man robbed me today .</td>\n",
       "      <td>man robbed today .</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>34790</th>\n",
       "      <td>fear</td>\n",
       "      <td>Youu call it JEALOUSY, I call it of #Losing YO...</td>\n",
       "      <td>Youu JEALOUSY, #Losing YOU...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>34791</th>\n",
       "      <td>sadness</td>\n",
       "      <td>I think about you baby, and I dream about you ...</td>\n",
       "      <td>think baby, dream time</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>34792 rows × 3 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "        Emotion                                               Text  \\\n",
       "0       neutral                                             Why ?    \n",
       "1           joy    Sage Act upgrade on my to do list for tommorow.   \n",
       "2       sadness  ON THE WAY TO MY HOMEGIRL BABY FUNERAL!!! MAN ...   \n",
       "3           joy   Such an eye ! The true hazel eye-and so brill...   \n",
       "4           joy  @Iluvmiasantos ugh babe.. hugggzzz for u .!  b...   \n",
       "...         ...                                                ...   \n",
       "34787  surprise  @MichelGW have you gift! Hope you like it! It'...   \n",
       "34788       joy  The world didnt give it to me..so the world MO...   \n",
       "34789     anger                           A man robbed me today .    \n",
       "34790      fear  Youu call it JEALOUSY, I call it of #Losing YO...   \n",
       "34791   sadness  I think about you baby, and I dream about you ...   \n",
       "\n",
       "                                              Clean_Text  \n",
       "0                                                      ?  \n",
       "1                        Sage Act upgrade list tommorow.  \n",
       "2      WAY HOMEGIRL BABY FUNERAL!!! MAN HATE FUNERALS...  \n",
       "3      eye ! true hazel eye-and brilliant ! Regular f...  \n",
       "4      ugh babe.. hugggzzz u .! babe naamazed nga ako...  \n",
       "...                                                  ...  \n",
       "34787    gift! Hope like it! hand wear ! It'll warm! Lol  \n",
       "34788    world didnt me..so world DEFINITELY cnt away!!!  \n",
       "34789                                 man robbed today .  \n",
       "34790                      Youu JEALOUSY, #Losing YOU...  \n",
       "34791                             think baby, dream time  \n",
       "\n",
       "[34792 rows x 3 columns]"
      ]
     },
     "execution_count": 19,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "id": "450c39c0-79dd-4eaf-85fe-57e344eb81bd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Features & Labels\n",
    "Xfeatures = df['Clean_Text']\n",
    "ylabels = df['Emotion']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "27d7f976-c28f-449e-ae1a-53a42bbda4e8",
   "metadata": {},
   "outputs": [],
   "source": [
    "#  Split Data\n",
    "x_train,x_test,y_train,y_test = train_test_split(Xfeatures,ylabels,test_size=0.3,random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "2f086f29-dba9-40d2-a9dd-f06a6cca3a4c",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Build Pipeline\n",
    "from sklearn.pipeline import Pipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "6b81cc86-2bef-40c2-b9a3-668caaadaff0",
   "metadata": {},
   "outputs": [],
   "source": [
    "# LogisticRegression Pipeline\n",
    "pipe_lr = Pipeline(steps=[('cv',CountVectorizer()),('lr',LogisticRegression())])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "dc64b9a7-efe2-4bc4-a0e7-46dff1d52b31",
   "metadata": {
    "collapsed": true,
    "jupyter": {
     "outputs_hidden": true
    },
    "tags": []
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.9/dist-packages/sklearn/linear_model/_logistic.py:763: ConvergenceWarning: lbfgs failed to converge (status=1):\n",
      "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.\n",
      "\n",
      "Increase the number of iterations (max_iter) or scale the data as shown in:\n",
      "    https://scikit-learn.org/stable/modules/preprocessing.html\n",
      "Please also refer to the documentation for alternative solver options:\n",
      "    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n",
      "  n_iter_i = _check_optimize_result(\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "Pipeline(steps=[('cv', CountVectorizer()), ('lr', LogisticRegression())])"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Train and Fit Data\n",
    "pipe_lr.fit(x_train,y_train)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "135ed6f8-56ff-4d53-85e3-541e3a7ae2d7",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Pipeline(steps=[('cv', CountVectorizer()), ('lr', LogisticRegression())])"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pipe_lr"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "28396371-5f5c-4a3b-b974-164e047764f3",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "0.6200421536692853"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Check Accuracy\n",
    "pipe_lr.score(x_test,y_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "eb3a26b6-d09e-422f-991b-b08c48f55b75",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Make A Prediction\n",
    "ex1 = \"This book was so interesting it made me happy\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "b08597d9-6f59-45cb-a648-95b0da1ce313",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['joy'], dtype=object)"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pipe_lr.predict([ex1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "id": "5b3822ac-17fc-43dd-9bb7-8dad07a4d32c",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[1.60353503e-03, 7.05960421e-03, 6.95963589e-03, 9.43781635e-01,\n",
       "        1.00430913e-04, 2.63557471e-02, 6.65377751e-05, 1.40728742e-02]])"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Prediction Prob\n",
    "pipe_lr.predict_proba([ex1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "id": "5b7c4596-d643-48e5-a777-79a6f55c49da",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array(['anger', 'disgust', 'fear', 'joy', 'neutral', 'sadness', 'shame',\n",
       "       'surprise'], dtype=object)"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# To Know the classes\n",
    "pipe_lr.classes_"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "id": "c0d40f62-b1fd-4748-a279-c8f50c748f26",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save Model & Pipeline\n",
    "import joblib\n",
    "pipeline_file = open(\"emotion_classifier_pipe_lr_03_june_2021.pkl\",\"wb\")\n",
    "joblib.dump(pipe_lr,pipeline_file)\n",
    "pipeline_file.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "377c4e98-67f0-45e5-8dd5-0417585754f0",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.17"
  },
  "vscode": {
   "interpreter": {
    "hash": "de038e0191e253dd4a6eabdecdf03101b15acd2fb359f0c7300c84702d7f1bc8"
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
